Load

Load data from ~/Orthonectida/Projects/Biostatistic/Biostat_2022/scripts/CharityHospital_R_2022-11-02_1448.r

Classified vars

# Base vars:
base_vars.factor = c('gender.factor', 'age.group',
              # Docs
              'id_status.factor.reason', 'oms_status.factor.reason', 'sn_status.factor.reason',
              # Additctions
              'nicotin.factor', 'alcogolic.factor', 'narco.factor', 'ne_narco.factor')

base_vars.bool = c('id_status.factor.bool', 'oms_status.factor.bool', 'sn_status.factor.bool')

quant_vars = c('ObsNum', 'age.actual')


# Dummy variables:
dummy_vars_raw = c('Observation', 'Homeless',
               'ds_icd_1.factor', 'ds_icd_2.factor', 'ds_icd_3.factor', 
               'etest_hiv.factor', 'etest_hbsag.factor', 'etest_hcv.factor', 'etest_lues.factor', 'etest_covid19.factor')
dummy_vars_already <- c('complaint_lite')

# Dinamic variables
dinamic_vars = c('Observation', 'Homeless')

Transform data

## Transform BirthDate to actual age (26.11.2022)
data$age.actual[!is.na(data$date_bd)] <- 
  age_calc(as.Date(data$date_bd[!is.na(data$date_bd)]),
           Sys.Date(),
           units = 'years') %>%
  floor

# Age group
data <-
  data %>%
  dplyr::mutate(
    age.group = case_when(
      age.actual < 18 ~ "<18 (несовершеннолетние)",
      age.actual >= 18 & age.actual < 45 ~ "18-44 (молодой возраст)",
      age.actual >= 45 & age.actual < 60 ~ "45-59 (средний возраст)",
      age.actual >= 60 & age.actual < 75 ~ "60-74 (пожилой возраст)",
      age.actual >= 75 ~ "75+ (старческий возраст)"
    )
  )
data$age.group <- factor(data$age.group, levels = sort(unique(data$age.group)))

Processing of each type of vars

# Dinamic vars (should be processed before dummy)
data_dinamic <-
  data %>%
    dplyr::select(record_id, dinamic_vars) %>%
    dplyr::group_by(record_id) %>%
    dplyr::summarise_all(function(x) paste(x, collapse = '|')) %>%
    dplyr::rename_all(function(x) ifelse(x != 'record_id', paste0(x, '.dinamic'), x)) %>%
    dplyr::mutate_at(vars(ends_with('.dinamic')), 
                     function(x) first_last_dinamics(x) %>% replace_na('Нет данных') %>% as.factor)
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(dinamic_vars)
## 
##   # Now:
##   data %>% select(all_of(dinamic_vars))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
# Renew names of dinamic vars with dinamic suffix
dinamic_vars = paste0(dinamic_vars, '.dinamic')

# Proccess dummy vars (drop original dummy vars)
data_dummy <-
  data %>%
  dummy_cols(remove_first_dummy = F, ignore_na = T,
             select_columns = dummy_vars_raw) %>%
  dplyr::select(-dummy_vars_raw) %>% # remove original vars
  # Rename dummy vars already (for further join with original df)
  dplyr::rename_at(vars(starts_with(dummy_vars_already)), ~paste0('.',.)) %>%
  dplyr::select(record_id, starts_with(c(dummy_vars_raw, paste0('.', dummy_vars_already)), ignore.case = F)) %>% # select new vars + already dumm vars
  dplyr::group_by(record_id) %>%
  dplyr::summarise_all(function(x) sum(x, na.rm =T)) %>%
  dplyr::ungroup() %>%
  dplyr::select_if(function(x) sum(x, na.rm = T) != 0) # drop zero sum
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(dummy_vars_raw)
## 
##   # Now:
##   data %>% select(all_of(dummy_vars_raw))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
# Add new dummied variables to list of quant vars
dummy_quant_vars = data_dummy %>% colnames %>% .[-1] # drom record_id

# Process quantative vars (among observation)
data_collapse <-
  data %>%
    dplyr::group_by(record_id) %>%
    dplyr::summarise(ObsNum = ifelse(n() == 1, 1, n() - 1))

# Rowwise vars
data_rowwise <- 
  data %>%
  dplyr::filter(is.na(Observation)) %>% # Keep only informative rows
  dplyr::select(record_id) %>%
  cbind(sum_rowwise_vars(data, 'ch_ds', 'отрицает')) # bind ID with rowwise vars

# Add rowwise new variables to quant
quant_vars = c(quant_vars, data_rowwise %>% dplyr::select(ends_with('.number')) %>% colnames)

Basic stats

selected_vars <- c(base_vars.bool, base_vars.factor, quant_vars, dummy_quant_vars, dinamic_vars)
##
data_done <-
  data %>%
  # filter out rows without base info
  dplyr::filter(is.na(redcap_repeat_instrument)) %>% 
  # Join with summed dummy vars
  right_join(data_dummy, by = 'record_id') %>%
  # Join with quant vars
  right_join(data_collapse, by = 'record_id') %>%
  # Join with rowwise vars
  right_join(data_rowwise, by = 'record_id') %>% 
  # Join with dinamic vars
  right_join(data_dinamic, by = 'record_id') %>%
  dplyr::select(record_id, all_of(selected_vars))

HandMade analyses

AgeGender

# record_id, gender.factor, age.group,
#                 Homeless.dinamic, id_status.factor.bool,
#                 oms_status.factor.bool, sn_status.factor.bool,
#                 ObsNum


ggGender1 <-
  data_done %>%
  dplyr::filter(!is.na(gender.factor) & !is.na(age.group)) %>%
  dplyr::group_by(age.group, gender.factor) %>%
  count %>%
  ggplot(aes(n, fct_rev(age.group), fill = gender.factor )) +
  geom_bar(stat = 'identity', position = 'dodge') +
  labs(x = 'Patients', y = 'Age group', fill = 'Gender') +
  scale_fill_manual(values = wes_palettes$Rushmore1[c(3,4)]) +
  theme_bw() +
  theme(axis.title = element_text(face = 'bold', size = 14),
        legend.title = element_text(face = 'bold', size = 14),
        axis.text = element_text(size = 12),
        legend.text = element_text(size = 12))
ggGender1

ggGender2 <-
  data_done %>%
  dplyr::mutate(age.bins = cut(data_done$age.actual, 15),
                count = 1) %>%
  dplyr::select(age.bins, count, gender.factor) %>%
  aggregate(count ~ gender.factor + age.bins, data = ., length) %>%
  dplyr::mutate(count = ifelse(gender.factor == 'мужской', count * -1, count)) %>%
  #Plot
  ggplot(aes(age.bins, count, fill = gender.factor)) +
  geom_bar(stat = 'identity') +
  facet_share(~gender.factor, dir = 'h', scales = 'free', reverse_num = T) +
  coord_flip() +
  labs(y = 'Age', x = 'Patients', fill = 'Sex') +
  scale_fill_manual(values = wes_palettes$Rushmore1[c(3,4)]) +
  theme_bw() +
  theme(axis.title = element_text(face = 'bold', size = 14),
        legend.title = element_text(face = 'bold', size = 14),
        axis.text = element_text(size = 12),
        legend.text = element_text(size = 12),
        strip.text.x = element_text(face = 'bold', size = 14)) 
ggGender2

Home

Home overall

data %>%
  dplyr::filter(!is.na(redcap_repeat_instrument)) %>%
  dplyr::select(record_id, redcap_repeat_instrument, Homeless) %>%
  dplyr::summarise(Count = table(Homeless, exclude = NULL),
                   Homeless = names(table(Homeless, exclude = NULL))) %>%
  dplyr::mutate(Homeless = case_when(is.na(Homeless) ~ 'нет данных',
                                     T ~ Homeless)) %>%
  # Plot 
  ggplot(aes(Count, fct_reorder(Homeless, Count) , fill = Homeless)) +
  geom_bar(stat = 'identity') +
  scale_fill_manual(values = wes_palettes$Rushmore1[2:5]) +
  theme_bw() +
  theme(axis.title = element_text(face = 'bold', size = 14),
        legend.title = element_text(face = 'bold', size = 14),
        axis.text = element_text(size = 12),
        legend.text = element_text(size = 12),
        legend.position = 'none') +
  labs(x  = 'Количество посещений', y = 'Тип бездомности')
## Don't know how to automatically pick scale for object of type <table>.
## Defaulting to continuous.

Home by Patients

ggHome <-
  data_done %>%
  dplyr::filter(!is.na(Homeless.dinamic)) %>%
  dplyr::group_by(Homeless.dinamic) %>%
  count %>% 
  ggplot(aes(n, fct_reorder(Homeless.dinamic, desc(n)))) +
  geom_bar(stat = 'identity') +
  # Appereance
  labs(x = 'Patients', fill = 'Age group', y = 'Home status') +
  scale_fill_manual(values = rev(wes_palettes$Rushmore1)) +
  theme_bw() +
  theme(axis.title = element_text(face = 'bold', size = 14),
        legend.title = element_text(face = 'bold', size = 14),
        axis.text = element_text(size = 12),
        legend.text = element_text(size = 12))
ggHome

# Parlament plot
df_count <-
  data_done %>%
  dplyr::filter(!is.na(Homeless.dinamic)) %>%
  dplyr::group_by(Homeless.dinamic) %>%
  count %>%
  dplyr::mutate(Homeless.dinamic = as.character(Homeless.dinamic)) %>%
  dplyr::arrange(desc(n))

# Plot
ggplot(df_count) +
  geom_parliament(aes(seats = n, fill = Homeless.dinamic)) +
  scale_fill_manual(values = c('darkgrey', 
                               "black", "blue", "lightblue", "yellow", 
                               "red","purple", "green",
                               'orange', 'tomato4'), 
                    labels = df_count$Homeless.dinamic) +
  coord_fixed() + 
  labs(fill = 'Тип бездомности') +
  theme(axis.title = element_text(face = 'bold', size = 14),
        legend.title = element_text(face = 'bold', size = 14),
        axis.text = element_text(size = 12),
        legend.text = element_text(size = 12)) +
  theme_void()
## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.

AgeAlcoNarco

  data_done %>%
  dplyr::select(Homeless.dinamic, nicotin.factor, alcogolic.factor, narco.factor, ne_narco.factor) %>%
  # dplyr::filter(!is.na(age.group)) %>%
  dplyr::group_by(Homeless.dinamic) %>%
  dplyr::summarise(answer = names(table(alcogolic.factor)), # all factors - same value and order
                   Алко = table(alcogolic.factor),
                   Никотин = table(nicotin.factor),
                   Нарко = table(narco.factor),
                   # narco.factor = names(table(narco.factor)),
                   НеНарко = table(ne_narco.factor)
                   # ne_narco.factor = names(table(ne_narco.factor)) 
                   ) %>%
  dplyr::ungroup() %>%
  melt %>%
  dplyr::mutate(value = as.numeric(value))  %>%
  dplyr::group_by(Homeless.dinamic) %>% dplyr::mutate(Sum = sum(value)) %>% dplyr::ungroup() %>%
  
  ggplot(aes(value, answer, fill = Homeless.dinamic)) +
  geom_bar(stat = 'identity') +
  facet_grid(cols = vars(variable)) +
  # Appereance
  labs(x = 'Patients', fill = 'Home status', y = 'Answer') +
  # scale_fill_manual(values = c(wes_palettes$BottleRocket2, wes_palettes$BottleRocket1)) +
  theme_bw() +
  theme(axis.title = element_text(face = 'bold', size = 14),
        legend.title = element_text(face = 'bold', size = 14),
        axis.text = element_text(size = 12),
        legend.text = element_text(size = 12),
        strip.text.x = element_text(face = 'bold', size = 14))

tmp = data$citizen.factor %>% table
countries.citizen <- tmp[tmp != 0] %>% 
  as.data.frame %>% setNames(c('Answer', 'Freq')) %>%
  ggplot(aes(Freq, fct_reorder(Answer, Freq))) +
  geom_bar(stat = 'identity', fill = wes_palette("Royal2")[3]) +
  theme_bw() +
  labs(x = 'Количество пациентов (по посещениям)', y = 'Страны (ответ со слов)') + 
  theme(axis.title = element_text(face = 'bold', size = 11),
        legend.title = element_text(face = 'bold', size = 11),
        axis.text = element_text(size = 10),
        legend.text = element_text(size = 10))

ggplotly(countries.citizen)
countries.citizen

Social diseases

# Можно добавить какие-то переменные:

# only in one fiela (is.na(redcap_repeat_instrument) == T):
soc_ds.answer = c('hiv_1.factor', 'hb_1.factor', 'hc_1.factor', 'lues.factor', 'artv.factor') 
# Could be in each observation:
soc_ds.test = c('etest_hiv.factor', 'etest_hbsag.factor', 'etest_hcv.factor', 
                'etest_lues.factor', 'etest_covid19.factor') 

data_test_dinamic <-
  data %>%
  dplyr::select(record_id, soc_ds.test) %>%
  na_if('не тестировался') %>% # replace "не тестировался" на NA
  group_by(record_id) %>%
    # remove all NAs, separeate each test result for each patients by "|"
  dplyr::summarise_all(function(x) ifelse(all(is.na(x)), NA, paste(x[!is.na(x)], collapse = '|'))) %>%
  dplyr::mutate_at(vars(starts_with('etest')), first_last_dinamics)
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(soc_ds.test)
## 
##   # Now:
##   data %>% select(all_of(soc_ds.test))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
    # Patients for checking (hiv):   
    # 1174 ++
    # 245 -- 
    # 1333 -+
  
# Join with answer
data %>%
    dplyr::filter(is.na(redcap_repeat_instrument)) %>% # keep only informative fields
    dplyr::select(record_id, soc_ds.answer) %>%
    left_join(data_test_dinamic, by = 'record_id') %>%
  #TODO
  dplyr::filter(record_id %in% c(245, 1174, 1333)) #TEMP checking:
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(soc_ds.answer)
## 
##   # Now:
##   data %>% select(all_of(soc_ds.answer))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
##   record_id  hiv_1.factor   hb_1.factor hc_1.factor lues.factor artv.factor
## 1       245      отрицает      отрицает    отрицает    отрицает        <NA>
## 2      1174 есть, со слов есть, со слов    отрицает    отрицает         нет
## 3      1333      отрицает      отрицает    отрицает    отрицает        <NA>
##                 etest_hiv.factor etest_hbsag.factor etest_hcv.factor
## 1                  отрицательный      отрицательный    отрицательный
## 2                  положительный      отрицательный   неопределенный
## 3 отрицательный -> положительный      отрицательный    отрицательный
##                etest_lues.factor etest_covid19.factor
## 1                  отрицательный                 <NA>
## 2                  отрицательный                 <NA>
## 3 отрицательный -> положительный        отрицательный

ICD-10: статистика

library('wesanderson')
library('ggplot2')
library('plotly')

icd_10 <- data %>% 
  dplyr::select(record_id, starts_with('ds_icd'), -ends_with('.factor')) %>%
  as.data.table %>%
  melt.data.table(id.vars = 'record_id', variable.name = "ds_icd", value.name = "illness_name") %>% 
  na.omit() %>% 
  as.data.frame %>%
  transform(illness_name = factor(illness_name,
                                  levels = c(1:19),
                                  labels = c("A00-B99", "C00-D48", "D50-D89", "E00-E90", "F00-F99", "G0-G99", "H00-H59", "H65-H75", "I00-I99", "J00-J99", "K00-K93", "L55-L59", "M00-M99", "N00-N99", "O00-O99", "Q00-Q99", "S00-T98", "V01-Y98", "Z00-Z13")))
  
#icd_10$illness_name <- as.factor(icd_10$illness_name)

icd_10.stat <- ggplot(data = icd_10, aes(x = illness_name), 
          position="dodge") +
          geom_bar(fill = wes_palette("Royal2")[5]) +
          xlab('Заболевания по МКБ-10') +
          ylab('Количество пациентов с данным заболеванием (по посещениям)') +
          theme(legend.position="left") +
          aes(stringr::str_wrap(icd_10$illness_name, 15)) + xlab(NULL) +
          theme_minimal() +
          coord_flip()
          
ggplotly(icd_10.stat)
## Warning: Use of `icd_10$illness_name` is discouraged.
## ℹ Use `illness_name` instead.
icd_10.stat
## Warning: Use of `icd_10$illness_name` is discouraged.
## ℹ Use `illness_name` instead.

ICD-10: корреляционная матрица типа бездомности и заболеваний по МКБ-10

#Создание общего столбца: icd; кодирование `Homeless` в систему 0,1,2; корреляционный анализ

#diamonds$factor_price <- ifelse(diamonds$price >= mean(diamonds$price),"1","0")
#diamonds$factor_carat <- ifelse(diamonds$carat >= mean(diamonds$carat),"1","0")
#diamods_pricaAndCarat <- table(diamonds$factor_price, diamonds$factor_carat)
#stat <- chisq.test(diamods_pricaAndCarat)
#main_stat <- c(stat$statistic)
#
#library(corrplot)
#insc_cor <- cor(insc_num)
#corrplot(insc_cor, order = 'AOE', col = COL2('RdBu', 10))
#corrplot(insc_cor, method = 'number')

ICD-10 по категориям бездомных

icd10.homeless <- data %>%
  dplyr::select(record_id, Homeless, starts_with('ds_')) %>%
  dplyr::select(-ends_with('.factor')) %>%
  as.data.table %>%
  melt.data.table(id.vars = c('record_id', 'Homeless')) %>%
  dplyr::mutate(value = as.factor(value)) %>%
  dplyr::group_by(Homeless) %>%
  dplyr::summarise(icd10 = factor(names(table(value)), 1:19),
                   Count = table(value)
                   ) %>%
  dplyr::mutate(Homeless = factor(Homeless, levels = c('уличный', 'условно уличный', 'домашний', 'нет данных'))) %>%
  
  # Plot
ggplot(aes(icd10, Count)) +
  facet_grid(rows = vars(Homeless)) +
  geom_bar(stat = 'identity', fill = wes_palette("Royal1")[4])
## `summarise()` has grouped output by 'Homeless'. You can override using the
## `.groups` argument.
icd10.homeless
## Don't know how to automatically pick scale for object of type <table>.
## Defaulting to continuous.